library(pROC)
library(ggpubr)
library(Biostrings)
## Warning: package 'BiocGenerics' was built under R version 4.0.5
library(data.table)
library(dplyr)
library(PepTools)
library(cowplot)

library(rstatix)

library(purrr)
library(tidyverse)
library(yardstick)
library(doParallel)
library(foreach)
library(stringdist)
library(caret)
library(elucidate)

1 Introduction

1.1 useful Functions

# Function to provide a closest match. Used to match HLA Alleles across mixed output styles.
ClosestMatch2 = function(string, stringVector){

  stringVector[amatch(string, stringVector, maxDist=Inf)]

}

2 Pathogenics vs cancer

2.1 Prepare TESLA dataset

library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
TESLA= fread("TESLA_DATASET_608.csv")

TESLA=TESLA %>% separate(col=PMHC, into = c("HLA_Allele","Peptide"),sep="_")%>% mutate(HLA_Allele = paste0("HLA-",HLA_Allele))%>%
  mutate(Length = nchar(Peptide))%>% mutate(VALIDATED = ifelse(VALIDATED==FALSE,"Negative","Positive"))%>% dplyr::rename(Immunogenicity=VALIDATED)

TESLA=TESLA %>% mutate(nM = NETMHC_PAN_BINDING_AFFINITY) %>% mutate("Thalf(h)" = BINDING_STABILITY) %>% mutate(HydroFraction = FRAC_HYDROPHOBIC)

TESLA %>% glimpse()
## Rows: 608
## Columns: 26
## $ HLA_Allele                  <chr> "HLA-A*01:01", "HLA-A*01:01", "HLA-A*01:01…
## $ Peptide                     <chr> "ASSFLKSFY", "ATLFSDSWYY", "CSDSGKSFINY", …
## $ PATIENT_ID                  <int> 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, …
## $ TISSUE_TYPE                 <chr> "PBMC", "PBMC", "PBMC", "PBMC", "PBMC", "P…
## $ MHC                         <chr> "A*01:01", "A*01:01", "A*01:01", "A*01:01"…
## $ ALT_EPI_SEQ                 <chr> "ASSFLKSFY", "ATLFSDSWYY", "CSDSGKSFINY", …
## $ PEP_LEN                     <int> 9, 10, 11, 9, 9, 10, 9, 9, 8, 9, 8, 9, 10,…
## $ MEASURED_BINDING_AFFINITY   <dbl> 45, 33, 77, 0, 3, 1320, NA, 354, 30, 9, NA…
## $ NETMHC_PAN_BINDING_AFFINITY <dbl> 132.1, 251.7, 64.8, 38.7, 24.0, 481.3, 22.…
## $ TUMOR_ABUNDANCE             <dbl> 57.819051, 51.857250, 30.389590, 159.96769…
## $ BINDING_STABILITY           <dbl> 0.76, 2.20, 2.24, 2.28, 1.21, 2.80, 1.82, …
## $ FRAC_HYDROPHOBIC            <dbl> 0.3333333, 0.3000000, 0.2727273, 0.6666667…
## $ AGRETOPICITY                <dbl> 0.019019295, 0.838764951, 0.010851108, 1.2…
## $ FOREIGNNESS                 <dbl> 0.000000e+00, 0.000000e+00, 9.280000e-20, …
## $ MUTATION_POSITION           <int> 2, 3, 11, 5, 8, 10, 9, 8, 7, 1, 2, 1, 1, 7…
## $ NUMBER_PREDICTING           <int> 11, 10, 8, 13, 13, 8, 5, 12, 6, 8, 6, 16, …
## $ Immunogenicity              <chr> "Negative", "Negative", "Negative", "Negat…
## $ TCR_FLOW_I                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ TCR_FLOW_I_QUANT            <dbl> 0.0000, 0.0000, 0.0011, 0.0000, 0.0000, 0.…
## $ TCR_NANOPARTICLE            <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ TCR_FLOW_II                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ TCR_FLOW_II_QUANT           <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, …
## $ Length                      <int> 9, 10, 11, 9, 9, 10, 9, 9, 8, 9, 8, 9, 10,…
## $ nM                          <dbl> 132.1, 251.7, 64.8, 38.7, 24.0, 481.3, 22.…
## $ `Thalf(h)`                  <dbl> 0.76, 2.20, 2.24, 2.28, 1.21, 2.80, 1.82, …
## $ HydroFraction               <dbl> 0.3333333, 0.3000000, 0.2727273, 0.6666667…

2.2 Read in pathogenic peptides data

2.2.1 Criteria

  • Only one peptide,imm,hla observation
  • Excluded any odd looking peptides
  • Filter for 9/10mers
  • Do a little cleaning
  • Filter for specific alleles i,.e those which stabpan can process from HLAs A,B,C
  • After applying these filters there are peptides from a vast array of HLAs
  • same lengths as TESLA
FullDataset=readRDS(file="Pathogenic_ClassI_peptides.rds")

2.3 Prepare pathogenic dataset

2.3.1 Run netMHCpan 4.0

  • Generate nM prediction
TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_NETMHCPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
    LENGTHS=FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
    testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
    write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
    RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
    system(paste0("/Applications/netMHCpan-4.0/netMHCpan -BA -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))

}

2.3.2 Read in netMHCpan results

  • So to compute nM for each peptide
TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_NETMHCPAN/"

data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))

Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))

FullDataset=FullDataset %>% inner_join(Netmhcpanres %>% select(! c(file,Pos,ID,core,icore))) %>% mutate(Binder = ifelse(NB==1,"BINDER","NONBINDER"))
## Joining, by = c("Peptide", "HLA_Allele")
FullDataset %>% nrow
## [1] 32698

2.3.3 Filter only for binders for susbequent analysis

  • Left with 24924 22717 (5772 and 25% immunogenic)
FullDataset=FullDataset %>% filter(Binder ==  'BINDER')
FullDataset %>% nrow
## [1] 22717
FullDataset %>% select(Immunogenicity) %>% table
## .
## Negative Positive 
##    16945     5772
FullDataset %>% select(Immunogenicity) %>% table %>% prop.table()
## .
##  Negative  Positive 
## 0.7459172 0.2540828

2.3.4 Run netmhc stabpan

  • Generate a binding stability prediction

TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_STABPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
    DATASET = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
    for(i in 1:length(unique(DATASET$Length))){
        LENGTH = unique(DATASET$Length)[i]
        testdata=paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
        write.table(DATASET %>% filter(Length == LENGTH) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
    # Run model
        RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
        system(paste0("/Applications/netMHCstabpan-1.0/netMHCstabpan -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",LENGTH," -xls -xlsfile ", RESULTS_OUTPUT))
    }
}

2.3.5 Read in predictions from stab pan

TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_STABPAN/"

data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )

Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Extract HLA and clean
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$HLA_Allele,pattern="LENGTH_[0-9]*_",replacement = ""))
# map HLA nomenclature
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))

Netmhcpanres=Netmhcpanres %>% select(Peptide, "Thalf(h)",HLA_Allele)

FullDataset=FullDataset %>% inner_join(Netmhcpanres, by=c("Peptide","HLA_Allele"))
# 22717 obs so joining worked correctly.
FullDataset %>% nrow
## [1] 22717

2.3.6 Compute hydrophobic fraction

library(Peptides)

library(stringi)

HYDROPHOBIC_RESIDUES = c("V","I","L","F","M","W","C") # from BARNES 2003 (SEE WELLS PAPER)

FullDataset=FullDataset %>% mutate(HydrophobicCount =  stri_count_regex(FullDataset$Peptide, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>% mutate(HydroFraction = HydrophobicCount/Length)

2.3.7 Finalise TESLA vs Pathogenic data for analysis

TESLA = TESLA %>% mutate(Dataset = "Cancer")
FullDataset = FullDataset%>% mutate(Dataset = "Pathogenic")
TESLASubset=TESLA %>% select(Peptide,Immunogenicity, HLA_Allele, nM, "Thalf(h)", HydroFraction, Dataset)
PathogenicSubset=FullDataset%>% select(Peptide,Immunogenicity, HLA_Allele, nM, "Thalf(h)", HydroFraction, Dataset)

combinedDataset = rbind(TESLASubset,PathogenicSubset)

combinedDataset %>% select(Dataset,Immunogenicity)%>% table
##             Immunogenicity
## Dataset      Negative Positive
##   Cancer          571       37
##   Pathogenic    16945     5772
combinedDataset =combinedDataset%>% mutate(Dataset = factor(Dataset, levels = c("Pathogenic","Cancer")))

2.4 Results

2.4.1 Affinity comparison between TESLA vs pathogenic data

BA_PATHTESLA_DENSITY_PLT= combinedDataset %>% ggplot(aes(x=nM, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
    scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Affinity\n(nM)")

mycomparisons =list(c("Positive","Negative"))

MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(nM))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(nM))


BA_PATHTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="nM",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Binding Affinity\n(nM)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

BA_PATHTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=nM, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(nM)")+theme_pubr(base_size = 16)

BA_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(nM, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
  TESLASubset %>% select(nM, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=nM, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Affinity\n(nM)")
plot_grid(BA_PATHTESLA_DENSITY_PLT, BA_PATHTESLA_BOX_PLT+theme(legend.position = "none"),BA_PATHTESLA_VIOLIN_PLT, BA_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

2.4.2 Stability

  • Next we look into stability in hrs between groups
STAB_PATHTESLA_DENSITY_PLT=combinedDataset %>% ggplot(aes(x=`Thalf(h)`, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
    scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Stability\n(hrs)")

MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(`Thalf(h)`))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(`Thalf(h)`))


STAB_PATHTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="Thalf(h)",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

STAB_PATHTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=`Thalf(h)`, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+theme_pubr(base_size = 16)
STAB_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(`Thalf(h)`, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
  TESLASubset %>% select(`Thalf(h)`, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=`Thalf(h)`, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Stability\n(hrs)")
library(cowplot)

plot_grid(STAB_PATHTESLA_DENSITY_PLT, STAB_PATHTESLA_BOX_PLT+theme(legend.position = "none"),STAB_PATHTESLA_VIOLIN_PLT, STAB_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

2.4.3 Fraction Hydrophobic

HYDRO_PATHTESLA_DENSITY_PLT= combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct() %>% ggplot(aes(x=HydroFraction, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+xlab("Fraction Hydrophobic")

MEDIAN_DATA_TESLA_NEG = combinedDataset%>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct() %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(HydroFraction))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(HydroFraction))

HYDRO_PATHTESLA_VIOLIN_PLT=combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% ggboxplot(x="Immunogenicity",y="HydroFraction")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center",comparisons = mycomparisons)+ylab("Fraction Hydrophobic")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

HYDRO_PATHTESLA_BOX_PLT=combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% ggplot(aes(x=Immunogenicity, y=HydroFraction, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Fraction Hydrophobic\n ")+theme_pubr(base_size = 16)

HYDRO_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(HydroFraction, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>% distinct()%>%rbind(
  TESLASubset %>% select(HydroFraction, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")%>% distinct()
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=HydroFraction, color=Dataset))+stat_ecdf(size=2)+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Fraction Hydrophobic")
library(cowplot)

plot_grid(HYDRO_PATHTESLA_DENSITY_PLT+rotate_x_text(angle=90), HYDRO_PATHTESLA_BOX_PLT+theme(legend.position = "none"),HYDRO_PATHTESLA_VIOLIN_PLT, HYDRO_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

3 Compare GBM w/ TESLA

3.1 Prepare datasets for comparison

3.1.1 GBM

  • TESLA already ready. So read in GBM and compute nM, binding stability and fraction hydrophobicity.
  • Read in GBM dataset
  • Class the contradictory peptide as positive..
  • Filter for 9 and 10mers
  • leaves 123 peptides in total. 20% are immunogenic
FullDataset = readRDS("GBM_test_data.rds")

3.1.1.1 Run netMHCpan for GBM

  • Generate binding affinity estimate
TEST_DATA_LOCATION = "GBM_EPITOPES_NETMHCPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
    LENGTHS=FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
    testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
    write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
    RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
    system(paste0("/Applications/netMHCpan-4.0/netMHCpan -BA -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))
}
3.1.1.1.1 Read in
TEST_DATA_LOCATION = "GBM_EPITOPES_NETMHCPAN/"

data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )

Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))

Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))

FullDataset=FullDataset %>% inner_join(Netmhcpanres %>% select(! c(file,Pos,ID,core,icore))) %>% mutate(Binder = ifelse(NB==1,"BINDER","NONBINDER"))
## Joining, by = c("Peptide", "HLA_Allele")
FullDataset %>% nrow
## [1] 123

3.1.1.2 run netmhcstabpan

  • generate stability predictions

TEST_DATA_LOCATION = "GBM_EPITOPES_STABPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
    DATASET = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
    for(i in 1:length(unique(DATASET$Length))){
        LENGTH = unique(DATASET$Length)[i]
        testdata=paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
        write.table(DATASET %>% filter(Length == LENGTH) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
        RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
        system(paste0("/Applications/netMHCstabpan-1.0/netMHCstabpan -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",LENGTH," -xls -xlsfile ", RESULTS_OUTPUT))
    }
}

3.1.1.2.1 Read in
TEST_DATA_LOCATION = "GBM_EPITOPES_STABPAN/"

data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )

Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$HLA_Allele,pattern="LENGTH_[0-9]*_",replacement = ""))

Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))

Netmhcpanres=Netmhcpanres %>% select(Peptide, "Thalf(h)",HLA_Allele)

FullDataset=FullDataset %>% inner_join(Netmhcpanres, by=c("Peptide"))
FullDataset %>% glimpse()
## Rows: 123
## Columns: 12
## $ Norm_peptide   <chr> ".........R", ".P.......", ".P........", ".P.......", "…
## $ Peptide        <chr> "FLEEIILKSL", "FLRESQNPL", "GLALGTPLSI", "GLAVNLSQI", "…
## $ HLA_Allele.x   <chr> "HLA-A02:01", "HLA-A02:01", "HLA-A02:01", "HLA-A02:01",…
## $ Immunogenicity <chr> "Negative", "Negative", "Negative", "Negative", "Negati…
## $ `1-log50k`     <dbl> 0.6756, 0.6498, 0.4990, 0.5065, 0.3949, 0.4457, 0.4416,…
## $ nM             <dbl> 33.4288, 44.2217, 226.0590, 208.4324, 696.8362, 402.439…
## $ Rank           <dbl> 0.4449, 0.5677, 1.7764, 1.6901, 3.3725, 2.4937, 2.5586,…
## $ Ave            <dbl> 0.6756, 0.6498, 0.4990, 0.5065, 0.3949, 0.4457, 0.4416,…
## $ NB             <int> 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1…
## $ Binder         <chr> "BINDER", "BINDER", "BINDER", "BINDER", "NONBINDER", "N…
## $ `Thalf(h)`     <dbl> 1.6064, 0.7949, 1.4697, 3.3911, 0.1720, 1.7038, 0.5679,…
## $ HLA_Allele.y   <chr> "HLA-A02:01", "HLA-A02:01", "HLA-A02:01", "HLA-A02:01",…

3.1.2 Compte fraction hydrophobicity

FullDataset=FullDataset%>% mutate(Length = nchar(Peptide)) %>% mutate(HydrophobicCount =  stri_count_regex(FullDataset$Peptide, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>% mutate(HydroFraction = HydrophobicCount/Length)

TESLASubset=TESLA%>% mutate(Dataset="TESLA") %>% select(Peptide,Immunogenicity, nM, "Thalf(h)", HydroFraction, Dataset)
GBMSubset=FullDataset%>% mutate(Dataset = "GBM")%>% select(Peptide,Immunogenicity, nM, "Thalf(h)", HydroFraction, Dataset)

combinedDataset = rbind(TESLASubset,GBMSubset)
combinedDataset%>% glimpse()
## Rows: 731
## Columns: 6
## $ Peptide        <chr> "ASSFLKSFY", "ATLFSDSWYY", "CSDSGKSFINY", "CVDWLIAVY", …
## $ Immunogenicity <chr> "Negative", "Negative", "Negative", "Negative", "Negati…
## $ nM             <dbl> 132.1, 251.7, 64.8, 38.7, 24.0, 481.3, 22.7, 88.3, 29.1…
## $ `Thalf(h)`     <dbl> 0.76, 2.20, 2.24, 2.28, 1.21, 2.80, 1.82, 2.22, 2.90, 1…
## $ HydroFraction  <dbl> 0.3333333, 0.3000000, 0.2727273, 0.6666667, 0.2222222, …
## $ Dataset        <chr> "TESLA", "TESLA", "TESLA", "TESLA", "TESLA", "TESLA", "…

3.2 Results

3.2.1 Affinity

BA_GBMTESLA_DENSITY_PLT=combinedDataset %>% ggplot(aes(x=nM, fill=Dataset))+geom_density(aes(y=..density..),alpha=0.4, bins=10,position = "identity")+facet_wrap(~Immunogenicity)+theme_pubr(base_size = 16)+
    scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Affinity\n(nM)")
## Warning: Ignoring unknown parameters: bins
BA_GBMTESLA_DENSITY_PLT

mycomparisons =list(c("Positive","Negative"))

MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(nM))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(nM))


BA_GBMTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="nM",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(nM)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)


BA_GBMTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=nM, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(nM)")+theme_pubr(base_size = 16)
library(cowplot)

plot_grid(BA_GBMTESLA_DENSITY_PLT,BA_GBMTESLA_BOX_PLT+theme(legend.position = "none"), BA_GBMTESLA_VIOLIN_PLT,nrow=1,align="hv", rel_widths = c(1,0.8,1,1.2),axis="bl")
## Warning: Removed 2 rows containing missing values (geom_hline).

## Warning: Removed 2 rows containing missing values (geom_hline).

3.2.2 Stability

STAB_GBMTESLA_DENSITY_PLT=combinedDataset %>% ggplot(aes(x=`Thalf(h)`, fill=Dataset))+geom_density(aes(y=..density..),alpha=0.4, bins=10,position = "identity")+facet_wrap(~Immunogenicity)+theme_pubr(base_size = 16)+
    scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Stability\n(hrs)")
## Warning: Ignoring unknown parameters: bins
MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(`Thalf(h)`))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(`Thalf(h)`))

STAB_GBMTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="Thalf(h)",add="boxplot")+theme_pubr(base_size = 27)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

STAB_GBMTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=`Thalf(h)`, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+theme_pubr(base_size = 27)

STAB_GBMTESLA_ECDF_PLT=GBMSubset %>% select(`Thalf(h)`, Immunogenicity, Peptide) %>% mutate(Dataset="GBM")%>%rbind(
  TESLASubset %>% select(`Thalf(h)`, Immunogenicity,Peptide)%>% mutate(Dataset = "TESLA")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=`Thalf(h)`, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 14)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Stability\n(hrs)")

3.2.3 Fraction of hydrophobicity

MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(HydroFraction))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(HydroFraction))

GBMTESL_FRACTIONHYDRO_BXPLT_COMPARE_CANCER=combinedDataset %>% ggboxplot(x="Immunogenicity",y="HydroFraction")+theme_pubr(base_size = 27)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Fraction Hydrophobic")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

GBMTESL_FRACTIONHYDRO_BXPLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=HydroFraction, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Fraction Hydrophobic")+theme_pubr(base_size = 27)

3.2.4 Supplementary fig Binding stability and fraction hydrophobicity

plot_grid(STAB_GBMTESLA_VIOLIN_PLT+ylab("Binding Stability (hrs)"),STAB_GBMTESLA_BOX_PLT+ylab("Binding Stability (hrs)"), GBMTESL_FRACTIONHYDRO_BXPLT_COMPARE_CANCER, GBMTESL_FRACTIONHYDRO_BXPLT+theme(legend.position = "none"), nrow=2, ncol=2,align="hv",axis="bt", rel_widths = c(1.2,0.8,1.2,0.8))
## Warning: Removed 2 rows containing missing values (geom_hline).

## Warning: Removed 2 rows containing missing values (geom_hline).

## Warning: Removed 2 rows containing missing values (geom_hline).

## Warning: Removed 2 rows containing missing values (geom_hline).